Package org.terrier.indexing

Source Code of org.terrier.indexing.PDFDocument

/*
* Terrier - Terabyte Retriever
* Webpage: http://terrier.org
* Contact: terrier{a.}dcs.gla.ac.uk
* University of Glasgow - School of Computing Science
* http://www.gla.ac.uk/
*
* The contents of this file are subject to the Mozilla Public License
* Version 1.1 (the "License"); you may not use this file except in
* compliance with the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS"
* basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
* the License for the specific language governing rights and limitations
* under the License.
*
* The Original Code is PDFDocument.java.
*
* The Original Code is Copyright (C) 2004-2011 the University of Glasgow.
* All Rights Reserved.
*
* Contributor(s):
*   Craig Macdonald <craigm{a.}dcs.gla.ac.uk> (original author)
*/
package org.terrier.indexing;
import java.io.CharArrayReader;
import java.io.CharArrayWriter;
import java.io.InputStream;
import java.io.Reader;
import java.util.Map;

import org.apache.log4j.Logger;
import org.pdfbox.pdfparser.PDFParser;
import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.util.PDFTextStripper;
import org.terrier.indexing.tokenisation.Tokeniser;
/**
* Implements a Document object for reading PDF documents. This object uses the
* <a href="http://www.pdfbox.org">PDFBox.org</a> library, so you'll need
* to ensure that PDFBox-0.6.7a.jar or greater is in your classpath when
* compiling or using this document. For using this class, you will also
* need the library <a href="http://logging.apache.org/log4j/">log4j</a>.
* @author Craig Macdonald
*/
public class PDFDocument extends FileDocument
{
  protected static final Logger logger = Logger.getLogger(PDFDocument.class);
  /**
   * Constructs a new PDFDocument, which will convert the docStream
   * which represents the file to a Document object from which an Indexer
   * can retrieve a stream of terms.
   * @param docStream InputStream the input stream that represents the
   *        the document's file.
   */
  public PDFDocument(String filename, InputStream docStream, Tokeniser tokeniser)
  {
    super(filename, docStream, tokeniser);
  }
  /**
   * Constructs a new PDFDocument
   * @param docStream
   * @param docProperties
   * @param tok
   */
  public PDFDocument(InputStream docStream,
      Map<String, String> docProperties, Tokeniser tok) {
    super(docStream, docProperties, tok);
  }
  /**
   * Constructs a new PDFDocument
   * @param docReader
   * @param docProperties
   * @param tok
   */
  public PDFDocument(Reader docReader, Map<String, String> docProperties,
      Tokeniser tok) {
    super(docReader, docProperties, tok);
  }
  /**
   * Constructs a new PDFDocument
   * @param filename
   * @param docReader
   * @param tok
   */
  public PDFDocument(String filename, Reader docReader, Tokeniser tok) {
    super(filename, docReader, tok);
  }
  /**
   * Returns the reader of text, which is suitable for parsing terms out of,
   * and which is created by converting the file represented by
   * parameter docStream. This method involves running the stream
   * through the PDFParser etc provided in the org.pdfbox library.
   * On error, it returns null, and sets EOD to true, so no terms
   * can be read from this document.
   * @param docStream the input stream that represents the document's file.
   * @return Reader a reader that is fed to an indexer.
   */
  protected Reader getReader(InputStream docStream)
  {
   
    PDFParser parser = null; PDDocument document = null; PDFTextStripper stripper = null;
    CharArrayWriter writer = null;
    try{
      parser = new PDFParser(docStream);
      parser.parse();
      document = parser.getPDDocument();
      writer = new CharArrayWriter();
      stripper = new PDFTextStripper();
      stripper.setLineSeparator("\n");
      stripper.writeText(document, writer);
      document.close();
      writer.close();
      parser.getDocument().close();
      return new CharArrayReader(writer.toCharArray());
    }catch (Exception e){
        //logger.warn("WARNING: Problem converting PDF: ",e);
      try{
        document.close();       
      }catch(Exception e1){
        //logger.warn("WARNING: Problem converting PDF: ",e1);
      }
      try{
        writer.close();
      }catch(Exception e2){
        //logger.warn("WARNING: Problem converting PDF: ",e2);
      }
      try{
        parser.getDocument().close();
      }catch(Exception e3){
        //logger.warn("WARNING: Problem converting PDF: ",e3); 
      }
      parser = null; document = null; writer = null; stripper = null;
      EOD=true;
      return null;
    }
  }
}
TOP

Related Classes of org.terrier.indexing.PDFDocument

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.